Libraries
# Attach the packages for the R chunks in this document.
library(mosaic)
library(tidyverse)
library(plotly)
library(reticulate)
# NOTE(review): the calls below repeat the ones above. Re-attaching a package
# is harmless, so they are kept; each call now sits on its own line because
# two calls fused on one line do not parse.
library(mosaic)
library(tidyverse)
library(plotly)
library(reticulate)

# Load the food survey and keep only the columns the plot needs.
food <- read_csv("data/food.csv")
food1 <- food %>%
  select(GPA, father_education, mother_education)

# Fill colour for each parent-education category, listed in plotting order.
# The names are also used as the factor levels below, so a colour always
# stays attached to its category.
#   BHSD = both high school dropouts
#   1HS  = at least one high school graduate
#   1SC  = at least one did some college
#   1BD  = at least one bachelor's degree
#   1GD  = at least one graduate degree
#   BGD  = both graduate degrees
edu_fill <- c(
  "BHSD" = "cyan3",
  "1HS" = "deepskyblue",
  "1SC" = "deepskyblue1",
  "1BD" = "deepskyblue2",
  "1GD" = "deepskyblue3",
  "BGD" = "deepskyblue4"
)

# Combine the two parents' education codes (1-5) into one category and make
# GPA numeric. Rows that match no case get NA, and GPA values that cannot be
# coerced become NA; na.omit() then drops every row containing an NA.
food2 <- food1 %>%
  mutate(
    parent_education = case_when(
      mother_education == 1 & father_education == 1 ~ "BHSD",
      mother_education %in% c(1, 2) & father_education == 2 |
        mother_education == 2 & father_education == 1 ~ "1HS",
      mother_education %in% c(1, 2, 3) & father_education == 3 |
        mother_education == 3 & father_education %in% c(1, 2) ~ "1SC",
      mother_education %in% c(1, 2, 3, 4) & father_education == 4 |
        mother_education == 4 & father_education %in% c(1, 2, 3) ~ "1BD",
      mother_education %in% c(1, 2, 3, 4) & father_education == 5 |
        mother_education == 5 & father_education %in% c(1, 2, 3, 4) ~ "1GD",
      mother_education == 5 & father_education == 5 ~ "BGD"
    ),
    GPA = as.numeric(GPA)
  ) %>%
  na.omit() %>%
  # Order the categories from lowest to highest education for the x axis.
  mutate(parent_education = factor(parent_education, levels = names(edu_fill)))

# Box plot of GPA per category with the individual students jittered on top.
# The fill is mapped to the category and resolved through a named manual
# scale: a fixed six-colour vector passed straight to geom_boxplot() errors
# whenever the data does not produce exactly six boxes.
ggplot(food2, aes(x = parent_education, y = GPA)) +
  geom_boxplot(aes(fill = parent_education), show.legend = FALSE) +
  geom_jitter(color = "black", size = 1, alpha = 0.9, width = 0.25) +
  scale_fill_manual(values = edu_fill) +
  labs(
    title = "How do Parent's Education Levels Effect College Student's GPAs?",
    x = "Parent's Education Levels"
  ) +
  theme_light()

Both High school Dropouts (BHSD)
At least one high school graduate (1HS)
At least one did some college (1SC)
At least one bachelor's degree (1BD)
At least one graduate degree (1GD)
Both graduate degrees (BGD)
To see the full analysis this chart was used in: College GPA vs Parental Education
# Load the rental listings and keep the rows with Gender "F" priced under
# $1000, rounding the distance to campus to two decimals for display.
Rent <- read_csv("data/rent.csv")
wRent <- Rent %>%
  filter(Gender == "F", Price < 1000) %>%
  mutate(MilesToCampus = round(MilesToCampus, 2))

# Interactive scatter plot: distance on x, capacity on y, with price shown
# through both marker colour and marker size. The trace type and mode are
# stated explicitly so plotly does not have to guess them (and emit
# "No trace type specified" messages) on every render.
plot_ly(
  wRent,
  x = ~MilesToCampus,
  y = ~Capacity,
  color = ~Price,
  colors = c("hotpink", "hotpink4"),
  size = ~Price,
  text = ~paste(Apartment, "\n$", Price),  # hover label: apartment and price
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    title = "Womens BYU-I Approved Housing\nUnder $1000 per Semester",
    xaxis = list(title = "Miles to the Center of Campus"),
    yaxis = list(title = "Maximum Housing Capacity")
  )

To see the full analysis this chart was used in: Housing Analysis for Stephanie
import pandas as pd
import numpy as np
import altair as alt
from IPython.display import Markdown
from IPython.display import display
from tabulate import tabulate

# Load the names data set; the `name`, `year` and `Total` columns are used below.
names = pd.read_csv("https://github.com/byuidatascience/data4names/raw/master/data-raw/names_year/names_year.csv")

# Keep only the rows for the four names being compared.
christian_names = names.query("name == ['Mary', 'Martha', 'Peter', 'Paul']")

# Shared encoding for the chart: year on x, Total on y, one colour per name.
# No mark is chosen here; it is added when the chart is displayed below.
christian_chart = alt.Chart(
    christian_names,
    title=alt.Title(
        "People Born Each Year",
        subtitle="with the names 'Martha', 'Mary', 'Paul', and 'Peter'",
    ),
).encode(
    # format "d" prints the year ticks as plain integers (no thousands separator)
    x=alt.X("year", title="Year").axis(format="d"),
    y=alt.Y("Total"),
    color="name",
)

# Render the encoding as a line chart.
christian_chart.mark_line()

# Load the flight-delay data used by the following section.
flights = pd.read_json("https://github.com/byuidatascience/data4missing/raw/master/data-raw/flights_missing/flights_missing.json")
# Strip every non-digit character so only the digits remain (values stay strings).
flights["num_of_delays_carrier"] = flights["num_of_delays_carrier"].str.replace(
    r"\D", "", regex=True
)

# Replace the missing-value markers ("", -999, "n/a") with NaN and correct
# the misspelled month name.
flights = (
    flights
    .replace(["", -999, "n/a"], np.nan)
    .replace(["Febuary"], "February")
)

# Fill NaN values in num_of_delays_late_aircraft with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column pulled out of the frame acts on an intermediate object, which warns
# in pandas 2.x and leaves the frame untouched under Copy-on-Write.
mean_late_air = flights["num_of_delays_late_aircraft"].mean()
flights["num_of_delays_late_aircraft"] = (
    flights["num_of_delays_late_aircraft"].fillna(mean_late_air)
)

# Fill each missing month with the value from the row before it
# (assigned back for the same reason as above).
flights["month"] = flights["month"].ffill()

# Per-airport totals plus derived delay metrics, worst delay rating first.
# The string "sum" is used instead of np.sum, which pandas deprecates as an
# aggregation callable; the grouped sums are the same.
totals = (
    flights
    .groupby("airport_code")
    .agg(
        total_minutes_delayed=("minutes_delayed_total", "sum"),
        total_delays=("num_of_delays_total", "sum"),
        total_flights=("num_of_flights_total", "sum"),
    )
    .assign(
        # Each lambda may use the columns created by the ones before it.
        total_hrs_delayed=lambda df: df.total_minutes_delayed / 60,
        ave_hrs_delayed=lambda df: df.total_hrs_delayed / df.total_delays,
        proportion_delayed=lambda df: df.total_delays / df.total_flights,
        # Rating = share of flights delayed x average hours per delay.
        delay_rating=lambda df: df.proportion_delayed * df.ave_hrs_delayed,
    )
    .sort_values("delay_rating", ascending=False)
    .reset_index()
)

# Horizontal bar chart of the delay rating per airport, longest bar on top.
best_airport = alt.Chart(
    totals,
    title=alt.Title(
        "Airports Rated by Delay",
        subtitle="The higher the rating, the worse airport",
    ),
).encode(
    x=alt.X("delay_rating:Q", title="Delay Rating"),
    y=alt.Y("airport_code:N", title="Airport Code", sort="-x"),
    color=alt.Color("airport_code:N", legend=None).scale(scheme="tealblues"),
).mark_bar()
best_airport